library(dplyr)
library(ggplot2)
library(readr)
library(cowplot)
library(NanoStringQCPro)
# Load the per-sample metadata; the raw CSV has spaced column names and a
# trailing empty column that are cleaned up in the next chunk.
infn = "../data/metadata-hepb.csv"
metadata = read_csv(infn)
## Warning: Missing column names filled in: 'X13' [13]
## Parsed with column specification:
## cols(
##   rowname = col_integer(),
##   `Sample Name` = col_character(),
##   `RCC File Name` = col_character(),
##   `Clinical Group` = col_integer(),
##   Month = col_character(),
##   `Plate+Sample` = col_character(),
##   `Spike In` = col_character(),
##   `Date Extracted` = col_character(),
##   `Who Extracted` = col_character(),
##   Comments = col_character(),
##   `Date Shipped` = col_character(),
##   `RCC File Date` = col_character(),
##   X13 = col_character()
## )

The metadata file needs some cleaning up, so we’ll do that here. First we’ll remove spaces from the column names and drop columns with no meaning.

# Replace the spaced column names with clean snake_case names, drop the two
# meaningless columns, and treat the disease class as a categorical variable.
clean_names = c("rownumber", "sample", "filename", "diseaseclass", "month",
                "plate_sample", "spikein", "extract_date", "extractor",
                "comments", "ship_date", "file_date", "empty")
colnames(metadata) = clean_names
metadata = metadata %>%
  select(-empty, -rownumber) %>%
  mutate(diseaseclass = as.factor(diseaseclass))

Samplenames and metadata look unique:

# Sanity check: both the sample names and the RCC file names uniquely
# identify a row of metadata.
nrow(metadata) == length(unique(metadata$sample))
## [1] TRUE
nrow(metadata) == length(unique(metadata$filename))
## [1] TRUE
# Map the numeric disease classes (1-7) onto phenotype labels and an
# active/inactive status, then join onto the metadata.
class = as.factor(c(1,2,3,4,5,6,7))
phenotype = c("acute", "positive_tolerant", "positive_active",
              "negative_inactive", "negative_chronic",
              "negative_unknown", "spontaneous")
active = c("active", "inactive", "active", "inactive", "active",
           "unknown", "inactive")
classdata = data.frame(diseaseclass=class, phenotype=phenotype, active=active)
metadata = metadata %>% left_join(classdata, by="diseaseclass")
# Flag which technician extracted each sample and whether a Zymo column
# was used, parsed out of the free-text extractor field.
metadata$es = grepl("ES", metadata$extractor)
metadata$cm = grepl("CM", metadata$extractor)
metadata$ed = grepl("ED", metadata$extractor)
metadata$zymo = grepl("zymo", metadata$extractor)
# Collapse to a binary active/inactive call based on disease class.
# NOTE(review): this overwrites the `active` column joined from classdata
# above, recoding class 6 from "unknown" to "inactive" -- confirm that is
# intended.  (The original ran this identical statement twice; the exact
# duplicate has been removed.)
metadata$active = ifelse(metadata$diseaseclass %in% c(1,3,5), "active",
                         "inactive")

Load data

# Build full paths to the RCC files and load them all into an ExpressionSet.
# paste0() replaces paste(..., sep="").
rccFiles = paste0("../data/rcc/", metadata$filename)
eset = newRccSet(rccFiles=rccFiles)
## Reading RCC files...
## checkRccSet() messages:
##   Optional featureData cols not found: BarCode, ProbeID, TargetSeq, Species, Comments
##   The following panel housekeeping genes were found: B2M|0, GAPDH|0, RPL19|0, ACTB|0, RPLP0|0
## Warning in .local(rccSet, ...): 
##   Non-standard CodeClass values found in featureData (values currently recognized are "Endogenous", "Housekeeping", "Positive", and "Negative"): Ligation, Endogenous1, SpikeIn)
# Use the human-readable sample names rather than RCC file names as columns.
colnames(eset) = metadata$sample

Scale miRNA with known crosstalk

A small set of miRNA have known inflated values; Nanostring has a heuristic to correct for the inflation via the formula using some supplied constants. Here we implement that fix to correct the expression of those miRNA.

plot_corrected_miRNA = function(scaled, eset) {
  # Compare raw vs crosstalk-corrected counts for every feature whose
  # values changed during correction, one facet per affected feature.
  # reshape is only needed here for melt(); ggplot2 and cowplot are
  # already attached at the top of the file, so the redundant in-function
  # library/require calls for them were removed.
  require(reshape)
  counts = exprs(eset)
  rownames(counts) = rownames(scaled)
  # Features where correction changed at least one value.
  is_scaled = rowSums(abs(counts - scaled)) > 0
  counts = melt(as.matrix(counts[is_scaled,]))
  colnames(counts) = c("gene", "sample", "value")
  counts$scaled = "no"
  scaled = melt(as.matrix(scaled[is_scaled,]))
  colnames(scaled) = c("gene", "sample", "value")
  scaled$scaled = "yes"
  all = rbind(counts, scaled)
  ggplot(all, aes(sample, value, color=scaled)) +
    # `scales=` spelled out fully (the original relied on partial
    # matching of `scale=`).
    facet_wrap(~gene, scales="free_y") +
    scale_y_sqrt() +
    geom_point(size=0.5) +
    theme(axis.title.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
    guides(colour = guide_legend(override.aes = list(size=6))) + ylab("") +
    ggtitle("miRNA hybridization crosstalk correction")
}

correct_miRNA = function(eset, plot=TRUE) {
  # Scales specific features tagged as having hybridization crosstalk by
  # the NanoString-provided scaling factor.  The factor is encoded after a
  # "|" in the GeneName; the crosstalk is proportional to the POS_A
  # positive-control counts in each sample.
  require(dplyr)
  fdat = fData(eset)
  fdat = fdat %>%
    tidyr::separate(GeneName, c("Name", "scalefactor"), sep='\\|',
                    fill="right", remove=FALSE) %>%
    dplyr::mutate(scalefactor=ifelse(is.na(scalefactor), 0, scalefactor))
  sf = as.numeric(fdat$scalefactor)
  posa = as.numeric(exprs(eset)["Positive_POS_A_ERCC_00117.1",])
  # One row of POS_A counts per feature, scaled by that feature's factor.
  scales = t(replicate(length(sf), posa))
  scales = scales * sf
  # Subtract the estimated crosstalk, clamping at zero.
  scaled = exprs(eset) - scales
  scaled[scaled<0] = 0
  # Honor the `plot` argument -- the original printed unconditionally,
  # making plot=FALSE a no-op.
  if (plot) {
    print(plot_corrected_miRNA(scaled, eset))
  }
  exprs(eset) = round(scaled)
  return(eset)
}
eset = correct_miRNA(eset)

Imaging QC

This percentage should be super high, close to 100%. If it isn’t, that usually means the cartridge needs to be rescanned. If it is < 3 weeks from the scan, rescanning should be okay. The NanoString folks call < 75% a failure. These all look fine.

plotFOV = function(eset, metadata) {
  # Percentage of fields of view successfully counted per sample, with the
  # 75% level (described in the text as the failure threshold) in red.
  pdat = pData(eset) %>%
    tibble::rownames_to_column() %>%
    left_join(metadata, by=c("rowname"="sample"))
  pdat$pcounted = 100 * pdat$FovCounted / pdat$FovCount
  ggplot(pdat, aes(rowname, pcounted)) +
    geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
    scale_y_continuous(expand = c(0, 0)) +
    expand_limits(y = c(0, 1.05 * max(pdat$pcounted))) +
    xlab("sample") +
    ylab("percentage of FOV counted") +
    geom_hline(yintercept=75, color="red")
}
plotFOV(eset, metadata)

Binding density

Binding density looks ok.

plotBD = function(eset, metadata) {
  # Per-sample binding density, with red reference lines at 0.05 and 2.25.
  pdat = pData(eset) %>%
    tibble::rownames_to_column() %>%
    left_join(metadata, by=c("rowname"="sample"))
  # (Removed a dead line copy-pasted from plotFOV that computed an unused
  # percent-FOV-counted column.)
  ggplot(pdat, aes(rowname, BindingDensity)) + geom_point() +
      theme(axis.text.x=element_blank(),
            axis.ticks.x=element_blank(),
            strip.text.x=element_text(size=8)) +
    scale_y_continuous(expand = c(0,0)) +
    expand_limits(y = c(0,1.05 * max(pdat$BindingDensity))) +
    ylab("Binding density") + xlab("sample") +
    geom_hline(yintercept=0.05, color="red") +
    geom_hline(yintercept=2.25, color="red")
}
plotBD(eset, metadata)

Total counts vs miRNA detected

Here I plotted the total counts vs the number of miRNA detected, where detected is it has counts > 30. There is a pretty big spread among the samples in terms of the miRNA detected. The Zymo columns seem to all have a low number of miRNA detected overall and, for a given total number of counts, a smaller number of miRNA detected. This indicates the samples using the Zymo columns are less complex than the non-Zymo samples.

plotComplexity = function(eset, metadata) {
  # Total counts vs number of miRNA detected per sample, colored by Zymo
  # column usage and shaped by extractor.  "Detected" here means counts > 10.
  # NOTE(review): the surrounding text says detected means counts > 30 --
  # confirm which threshold is intended.
  counts = exprs(eset)
  # (Removed an unused subset of Endogenous-only counts.)
  cdf = data.frame(total=colSums(counts), detected=colSums(counts > 10))
  rownames(cdf) = colnames(counts)
  cdf$sample = rownames(cdf)
  cdf = cdf %>% left_join(metadata, by="sample")
  ggplot(cdf, aes(total, detected, color=zymo, shape=es)) + geom_point()
}
plotComplexity(eset, metadata)

library(ggplot2)
library(dplyr)
library(cowplot)
is_positive = function(column) {
  # TRUE for feature names tagged as positive controls ("Pos...").
  grepl("Pos", column)
}
is_negative = function(column) {
  # TRUE for feature names tagged as negative controls ("Neg...").
  grepl("Neg", column)
}
is_spikein = function(column) {
  # TRUE for feature names tagged as spike-in controls ("Spike...").
  grepl("Spike", column)
}
is_ligation = function(column) {
  # TRUE for feature names tagged as ligation controls ("Ligati...").
  grepl("Ligati", column)
}
is_housekeeping = function(column) {
  # TRUE for feature names tagged as housekeeping genes ("Housekee...").
  grepl("Housekee", column)
}
is_prior = function(column) {
  # TRUE for any of the three prior miRNA of interest (miR-159, miR-248,
  # miR-254), collapsed into a single regex alternation.
  grepl("miR-(159|248|254)", column)
}

extract_pred = function(eset, predicate, counts=FALSE) {
  # Pull the rows whose names satisfy `predicate` into long
  # (spot, sample, count) format and attach the sample metadata.
  # `counts=TRUE` means `eset` is already a counts data.frame; otherwise
  # the expression matrix is extracted from the ExpressionSet.
  # (The original reused the logical `counts` flag as the data variable;
  # a separate name avoids that shadowing.)
  if (!counts) {
    cdf = data.frame(exprs(eset))
  } else {
    cdf = eset
  }
  toplot = cdf[predicate(rownames(cdf)),] %>%
    tibble::rownames_to_column() %>%
    tidyr::gather("sample", "count", -rowname)
  colnames(toplot) = c("spot", "sample", "count")
  # NOTE(review): depends on `metadata` from the global environment.
  toplot = toplot %>% left_join(metadata, by="sample")
  return(toplot)
}
spotbarplot = function(toplot) {
  # One bar of raw counts per sample, faceted by control spot.
  ggplot(toplot, aes(sample, count)) +
    geom_bar(stat='identity') +
    facet_wrap(~spot) +
    theme(text = element_text(size=8),
          axis.text.x = element_blank())
}
spotboxplot = function(toplot) {
  # Box plots of counts per spot, grouped on the x axis.
  # NOTE(review): aes(linehypo, ...) -- no `linehypo` column is created
  # anywhere in this file (extract_pred yields spot/sample/count plus the
  # metadata columns), and this function is never called in the visible
  # source.  Possibly a leftover from another dataset -- confirm before use.
  ggplot(toplot,
        aes(linehypo, count)) + geom_boxplot() +
    facet_wrap(~spot) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

Positive controls

Below we look at the R^2 correlation between the expected positive control concentrations and the observed concentrations for each sample.

A set of samples have a lower correlation than the other samples.

# Bar plots of each positive-control probe across all samples.
spotbarplot(extract_pred(eset, is_positive))

posR2 = function(eset) {
  # Per-sample correlation between the observed positive-control counts and
  # their known log2 spike concentrations; plots the correlations and
  # returns the table.
  pcdf = data.frame(concentration=log2(c(128, 32, 8, 2, 0.5, 0.125)),
                    GeneName=paste("POS", c("A", "B", "C", "D", "E", "F"), sep="_"))
  pccounts = subset(exprs(eset), grepl("Positive_POS", rownames(exprs(eset))))
  pccounts = pccounts[sort(rownames(pccounts)),]
  rownames(pccounts) = pcdf$GeneName
  corsamples = data.frame(correlation=apply(pccounts, 2,
                                            function(x) cor(x, pcdf$concentration)),
                          sample=colnames(pccounts)) %>%
    left_join(metadata, by="sample")
  # print() is required for a ggplot to render inside a function (matching
  # ligR2); the original built the plot but never displayed it.
  print(ggplot(corsamples, aes(sample, correlation)) + geom_point() +
      theme(axis.text.x=element_blank(),
            axis.ticks.x=element_blank(),
            strip.text.x=element_text(size=8)) +
    scale_y_continuous(expand = c(0,0)) +
    expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
    ylab("positive control correlation") +
    xlab("sample"))
  return(corsamples)
}
corsamples = posR2(eset)
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector

Negative controls

We can see some samples have a higher negative control count than the other samples.

# Bar plots of each negative-control probe across all samples.
spotbarplot(extract_pred(eset, is_negative))

# Tabulate negative-control spots with unusually high counts (> 50).
knitr::kable(subset(extract_pred(eset, is_negative), count > 50))
spot sample count filename diseaseclass month plate_sample spikein extract_date extractor comments ship_date file_date phenotype active es cm ed zymo
Spike In

We have two different sets of spike-ins (OLD/NEW), but they don’t look much different from each other:
# Bar plots of each spike-in probe across all samples.
spotbarplot(extract_pred(eset, is_spikein))

spikebarplot = function(toplot) {
  # Bar chart of counts per sample, filled by spike-in set and faceted
  # by spot.  (Defined but not called in the visible source.)
  ggplot(toplot, aes(sample, count, fill=spikein)) +
    geom_bar(stat='identity') +
    facet_wrap(~spot) +
    theme(text = element_text(size=8),
          axis.text.x = element_blank())
}

Ligation

There is quite a bit of variability in the ligation controls for the samples:

ligboxplot = function(toplot) {
  # Box plots of ligation-control counts grouped by phenotype, one facet
  # per control spot.
  ggplot(toplot, aes(phenotype, count)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    facet_wrap(~spot)
}
ligboxplot(extract_pred(eset, is_ligation))

Ligation control R^2

There are some samples with a pretty bad R2 for the ligation control. We will drop these.

ligR2 = function(eset) {
  # Per-sample correlation between the ligation positive-control counts and
  # their known log2 concentrations; prints a plot and returns the table.
  pcdf = data.frame(concentration=log2(c(128, 32, 8)),
                    # Fixed label construction: the original
                    # paste("POS_", ..., sep="_") produced "POS__A" with a
                    # doubled underscore.  These names are only row labels.
                    GeneName=paste0("POS_", c("A", "B", "C")))
  pccounts = subset(exprs(eset), grepl("Ligation_LIG_POS", rownames(exprs(eset))))
  pccounts = pccounts[sort(rownames(pccounts)),]
  rownames(pccounts) = pcdf$GeneName
  corsamples = data.frame(correlation=apply(pccounts, 2,
                                            function(x) cor(x, pcdf$concentration)),
                          sample=colnames(pccounts)) %>%
    left_join(metadata, by="sample")
  print(ggplot(corsamples, aes(sample, correlation)) + geom_point() +
      theme(axis.text.x=element_blank(),
            axis.ticks.x=element_blank(),
            strip.text.x=element_text(size=8)) +
    scale_y_continuous(expand = c(0,0)) +
    expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
    ylab("ligation control correlation") +
    xlab("sample"))
  return(corsamples)
}
# Compute per-sample ligation-control correlations and plot them.
ligcor = ligR2(eset)
## Warning in cor(x, pcdf$concentration): the standard deviation is zero
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
## Warning: Removed 1 rows containing missing values (geom_point).

# Samples with a poor ligation-control correlation (< 0.9) will be dropped.
dropsamples = subset(ligcor, correlation < 0.9)$sample

Ligation efficiency

Here we calculated ligation efficiency for the samples. This is the log of LIG_POS_A(i) / mean(LIG_POS_A) for each sample i. It doesn’t matter which direction this is in, it will get corrected for in the model. Again we can see most of the Zymo columns are outliers in this.

A small number of samples seem like the ligation step failed, so we can drop those from the analysis.

lignorm = function(eset, feature="Ligation_LIG_POS_A") {
  # Per-sample log2 ratio of the matched ligation-control counts to their
  # mean across samples (with a +1 pseudocount in the numerator).
  m = exprs(eset)
  ligcounts = as.numeric(m[grepl(feature, rownames(m)),])
  out = log2((ligcounts + 1) / mean(ligcounts))
  names(out) = colnames(m)
  out
}
# Per-sample ligation normalization factor and its linear-scale inverse,
# used later to rescale the counts.
metadata$lignorm = lignorm(eset)
metadata$ligscale = 2^(-metadata$lignorm)
ggplot(metadata, aes(sample, lignorm)) + geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
  ylab("ligation factor") +
  xlab("sample")

# Also drop samples whose ligation factor indicates a failed ligation step.
dropsamples = unique(c(dropsamples, subset(metadata, lignorm < -2.5)$sample))

Housekeeping

Some housekeeping genes are expressed more highly in some samples. Not sure what we are supposed to do with this information, if there are any suggestions we could easily implement it.

# Bar plots of the housekeeping genes across all samples.
spotbarplot(extract_pred(eset, is_housekeeping))

Noise floor cutoff

We’ll normalize the libraries and look at the negative control expression and then come up with a cutoff that is above the noise floor for the libraries. It looks like 30 is a reasonable noise floor to use.

drop_unusable = function(counts) {
  # Remove control features (spike-in, positive, housekeeping, ligation)
  # and the blank samples, keeping everything else.
  rn = rownames(counts)
  drop = is_spikein(rn) |
    is_positive(rn) |
    is_housekeeping(rn) |
    is_ligation(rn)
  keep = counts[!drop,]
  keep = keep[, !grepl("Blank", colnames(keep))]
  return(keep)
}
library(DESeq2)
# Size-factor-normalize the usable counts with DESeq2 and inspect the
# distribution of normalized negative-control counts to pick a noise floor.
dds = DESeqDataSetFromMatrix(drop_unusable(exprs(eset)), colData=metadata,
                             design=~sample)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = estimateSizeFactors(dds)
ncounts = data.frame(counts(dds, normalized=TRUE))
negcounts = extract_pred(ncounts, is_negative, counts=TRUE)
ggplot(extract_pred(ncounts, is_negative, counts=TRUE), aes(count)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Chosen noise floor, based on the negative-control histogram above.
nfloor = 30

Drop control miRNA and non-expressed miRNA

filter_miRNA = function(counts, floor=nfloor) {
  # Remove control features, miRNA not expressed above `floor` in at least
  # 20% of samples, and the blank samples.
  # `floor` defaults to the globally defined noise floor (nfloor),
  # preserving the original behavior while making the cutoff an explicit,
  # overridable parameter.
  rn = rownames(counts)
  drop = is_spikein(rn) |
    is_positive(rn) |
    is_negative(rn) |
    is_housekeeping(rn) |
    is_ligation(rn) |
    (rowSums(counts > floor) < (0.2 * ncol(counts)))
  keep = counts[!drop,]
  keep = keep[, !grepl("Blank", colnames(keep))]
  return(keep)
}
# Filter to usable, expressed miRNA and remove the ligation-failure samples,
# keeping the metadata in sync with the remaining columns.
counts = exprs(eset)
counts = filter_miRNA(counts)
counts = counts[, ! colnames(counts) %in% dropsamples]
metadata = subset(metadata, sample %in% colnames(counts))

Is it okay to normalize by total library size?

One of the things that came up was whether or not it is okay to normalize the counts by total library size. The idea was that in some experimental conditions, it might be that there are overall more miRNA expression than other conditions, and if we normalize by library size we will lose that information.

If that is true, then we should expect to see different library sizes for different phenotypes. We test that here by calculating the library size for each phenotype and fitting a model of the library size dependent on phenotype. We see that the positive_tolerant samples have a higher library size, which might be indicative of more miRNA expression. So we can’t just normalize by total library size, or else we will lose that information.

# Test whether total library size depends on phenotype: fit a linear model
# of library size on phenotype with negative_inactive as the reference.
cdf = data.frame(phenotype=metadata$phenotype, libsize=colSums(counts))
cdf$phenotype = relevel(cdf$phenotype, ref="negative_inactive")
fit = lm(libsize~phenotype, cdf)
summary(fit)
## 
## Call:
## lm(formula = libsize ~ phenotype, data = cdf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -99116 -38186 -20034  27008 213892 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   44003      12226   3.599 0.000462 ***
## phenotypeacute                -9193      17764  -0.517 0.605754    
## phenotypenegative_chronic      2438      16893   0.144 0.885466    
## phenotypenegative_unknown    -13042      18339  -0.711 0.478341    
## phenotypepositive_active       8656      17290   0.501 0.617538    
## phenotypepositive_tolerant    57861      19965   2.898 0.004452 ** 
## phenotypespontaneous           6832      17083   0.400 0.689925    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54680 on 122 degrees of freedom
## Multiple R-squared:  0.1062, Adjusted R-squared:  0.06223 
## F-statistic: 2.416 on 6 and 122 DF,  p-value: 0.03057
# Library-size distributions per phenotype.
ggplot(cdf, aes(libsize)) + geom_histogram() + facet_wrap(~phenotype)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

But there are clearly variations within a class of the library size, so we have to do something to normalize it.

Does normalization for ligation effiency buy us anything?

Here we fit two models. The first model we fit the full model, including the ligation normalization term in the model. Then fit a reduced model without the lignorm term to answer the question ‘are there miRNA that are affected by the ligation efficiency?’ We already scaled the counts via the ligation normalization scale, so here we are asking are specific genes still affected.

# Likelihood ratio test: does the lignorm term explain expression of any
# miRNA after the counts have already been scaled by ligscale?
full = ~spikein+lignorm+phenotype
reduced = ~spikein+phenotype
ncounts = round(counts * metadata$ligscale)
dds = DESeqDataSetFromMatrix(ncounts, colData=metadata,
                             design=full)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq(dds, full=full, reduced=reduced, test="LRT")
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
res = results(dds)
plotMA(res)

# miRNA still significantly affected by ligation efficiency (padj < 0.05).
res = data.frame(res) %>% tibble::rownames_to_column() %>%
  arrange(padj) %>% filter(padj < 0.05)
knitr::kable(subset(res, padj < 0.05))
rowname baseMean log2FoldChange lfcSE stat pvalue padj
Endogenous1_hsa-miR-92a-3p|0_MIMAT0000092 70.34172 -0.3404734 0.2627347 31.987121 0.0000000 0.0000017
Endogenous1_hsa-miR-16-5p|0_MIMAT0000069 709.41678 0.5103207 0.3120318 27.752372 0.0000001 0.0000076
Endogenous1_hsa-miR-144-3p|0_MIMAT0000436 75.86848 -0.3216294 0.4332957 18.455379 0.0000174 0.0005605
Endogenous1_hsa-miR-181a-5p|0_MIMAT0000256 132.73355 0.3623254 0.3765303 18.153404 0.0000204 0.0005605
Endogenous1_hsa-miR-320e|0_MIMAT0015072 29.85206 -0.3645865 0.3604161 17.421117 0.0000299 0.0006589
Endogenous1_hsa-miR-23a-3p|0_MIMAT0000078 463.84048 0.1198514 0.3616441 17.023988 0.0000369 0.0006767
Endogenous1_hsa-miR-520f-3p|0_MIMAT0002830 56.09597 -0.1196358 0.3942447 16.548837 0.0000474 0.0007451
Endogenous1_hsa-miR-28-5p|0_MIMAT0000085 55.87426 -0.1561955 0.2447438 13.920096 0.0001907 0.0023314
Endogenous1_hsa-miR-98-5p|0_MIMAT0000096 76.38813 0.3065609 0.3062086 14.015916 0.0001813 0.0023314
Endogenous1_hsa-miR-28-3p|0_MIMAT0004502 20.92937 -0.0885269 0.2835116 12.701442 0.0003654 0.0040191
Endogenous1_hsa-miR-374a-5p|0_MIMAT0000727 162.56970 0.5843541 0.3283268 10.567013 0.0011512 0.0105530
Endogenous1_hsa-miR-148b-3p|0_MIMAT0000759 37.09184 -0.2132900 0.2782874 10.632835 0.0011110 0.0105530
Endogenous1_hsa-miR-425-5p|0_MIMAT0003393 27.28747 0.1493647 0.2225925 10.191958 0.0014105 0.0119354
Endogenous1_hsa-miR-194-5p|0_MIMAT0000460 33.88295 0.2167322 0.4289128 9.483772 0.0020730 0.0162876
Endogenous1_hsa-miR-612|0_MIMAT0003280 68.02116 -0.2486859 0.5189142 9.336776 0.0022460 0.0164707
Endogenous1_hsa-miR-340-5p|0_MIMAT0004692 46.51750 -0.0538673 0.2500769 9.056817 0.0026172 0.0179931
Endogenous1_hsa-miR-19b-3p|0_MIMAT0000074 77.25965 0.0642163 0.2733084 8.799971 0.0030124 0.0194917
Endogenous1_hsa-miR-223-3p|0_MIMAT0000280 2491.47725 0.3267386 0.4660603 8.675970 0.0032243 0.0197042
Endogenous1_hsa-miR-125b-5p|0_MIMAT0000423 69.14712 0.0472137 0.3927637 8.505802 0.0035402 0.0204957
Endogenous1_hsa-miR-199a-3p+hsa-miR-199b-3p|0_MIMAT0000232 408.66601 0.1798501 0.4091481 7.635129 0.0057242 0.0314833
Endogenous1_hsa-miR-185-5p|0_MIMAT0000455 44.03114 -0.1532540 0.2367752 6.973042 0.0082747 0.0433435
Endogenous1_hsa-miR-151a-3p|0_MIMAT0000757 51.64657 0.0638871 0.3115583 6.793692 0.0091481 0.0457403

PCA

It looks like maybe there is some separation along the second principal component for the samples labelled as positive vs the samples labelled as negative:

# Variance-stabilize the (unscaled) counts for PCA.
dds = DESeqDataSetFromMatrix(counts, colData=metadata, design=full)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
vst = varianceStabilizingTransformation(dds)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
pca_loadings = function(object, ntop=500) {
  # PCA on the `ntop` most-variable rows of the transformed assay, with the
  # per-component proportion of variance attached as `percentVar`.
  rv = matrixStats::rowVars(assay(object))
  top = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
  pca = prcomp(t(assay(object)[top, ]))
  pvar = pca$sdev^2 / sum(pca$sdev^2)
  names(pvar) = colnames(pca$x)
  pca$percentVar = pvar
  pca
}
# Run PCA and join the component scores back onto the sample metadata.
pc = pca_loadings(vst)
comps = data.frame(pc$x)
comps$Name = rownames(comps)
library(dplyr)
comps = comps %>% left_join(metadata, by=c("Name"="sample"))
pca_plot = function(comps, nc1, nc2, colorby, percentVar=pc$percentVar) {
  # Scatter plot of two principal components colored by a metadata column,
  # with the percent variance explained in the axis labels.
  # `percentVar` defaults to `pc$percentVar` resolved from the calling
  # environment, preserving the original (global-reading) behavior while
  # letting callers pass it explicitly.
  c1str = paste0("PC", nc1)
  c2str = paste0("PC", nc2)
  ggplot(comps, aes_string(c1str, c2str, color=colorby)) +
    geom_point() + theme_bw() +
    xlab(paste0(c1str, ": ", round(percentVar[nc1] * 100), "% variance")) +
    ylab(paste0(c2str, ": ", round(percentVar[nc2] * 100), "% variance"))
}
pca_plot(comps, 1, 2, "phenotype")

Differential expression

Here we ignore the miRNA-specific ligation normalization and just do what the NanoString folks suggested, scale the counts to remove the ligation effect for each sample.

write_results = function(res, fname) {
  # Write a DESeq2 results table to CSV, ordered by p-value, with the
  # feature names as a regular column (no rownames, no quoting).
  out = data.frame(res) %>%
    tibble::rownames_to_column() %>%
    arrange(pvalue)
  write.table(out, file=fname, quote=FALSE, col.names=TRUE,
              row.names=FALSE, sep=",")
}
# Fit the differential-expression model on ligation-scaled counts, with
# spike-in set and disease class in the design.
full = ~spikein+diseaseclass
ncounts = round(counts * metadata$ligscale)
dds = DESeqDataSetFromMatrix(ncounts, colData=metadata, design=full)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq(dds, fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotDispEsts(dds)

Active vs inactive

# Contrast: all "active" classes (1, 3, 5) vs "inactive" classes (2, 4, 7).
res = results(dds, contrast=list(c("diseaseclass1", "diseaseclass3", "diseaseclass5"),
                                 c("diseaseclass2", "diseaseclass4", "diseaseclass7")))
plotMA(res)

write_results(res, "active-vs-inactive.csv")

active vs inactive

Active vs inactive (chronic)

# Contrast: chronic active classes (3, 5) vs chronic inactive classes (2, 4).
res = results(dds, contrast=list(c("diseaseclass3", "diseaseclass5"),
                                 c("diseaseclass2", "diseaseclass4")))
plotMA(res)

# Fixed output filename typo ("chronoic" -> "chronic").
write_results(res, "active-vs-inactive-chronic.csv")

active vs inactive (chronic)

Active vs inactive (acute)

# Contrast: class 1 (acute) vs class 7 (spontaneous).
res = results(dds, contrast=list(c("diseaseclass1"), c("diseaseclass7")))
plotMA(res)

write_results(res, "active-vs-inactive-acute.csv")

active vs inactive (acute)

Group 3 to 2

# Contrast: class 3 vs class 2.
res = results(dds, contrast=list(c("diseaseclass3"), c("diseaseclass2")))
plotMA(res)

write_results(res, "group3-vs-group2.csv")

group3 vs group2

Group 5 to 4

# Contrast: class 5 vs class 4.
res = results(dds, contrast=list(c("diseaseclass5"), c("diseaseclass4")))
plotMA(res)

write_results(res, "group5-vs-group4.csv")

group5 vs group4

Group 3 to 4

# Contrast: class 3 vs class 4.
res = results(dds, contrast=list(c("diseaseclass3"), c("diseaseclass4")))
plotMA(res)

write_results(res, "group3-vs-group4.csv")

group3 vs group4

Group 3 to 7

# Contrast: class 3 vs class 7.
res = results(dds, contrast=list(c("diseaseclass3"), c("diseaseclass7")))
plotMA(res)

write_results(res, "group3-vs-group7.csv")

group3 vs group7

Group 5 to 7

# Contrast: class 5 vs class 7.
res = results(dds, contrast=list(c("diseaseclass5"), c("diseaseclass7")))
plotMA(res)

write_results(res, "group5-vs-group7.csv")

group5 vs group7